# Load common libraries
# BUG FIX: `import matplotlib as plt` imports the top-level package, so calls such as
# plt.title()/plt.xlabel() would fail with AttributeError; pyplot is the plotting API.
import matplotlib.pyplot as plt # visualize data graphically
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd # To easily grab and transform data
from scipy import stats # To interpret output
#import statsmodels.api as sm
# Load revised demographic data with the age grouping and amount grouping.
# NOTE: the CSV filename (including its spelling) is the actual name of the data file — do not "correct" it.
data = pd.read_csv('demorgraphic_data_age_amount_grouping.csv')
df = pd.DataFrame(data)
df
# check data types, in this case everything looks fine no changes need to be made.
df.info()
# note: age_g1 is age grouping: age<=24 is 1, 24<age<=34 is 2, 34<age<=44 is 3, 44<age<=54 is 4, 54<age<=64 is 5,
# 64<age<=74 is 6, age>74 is 7
# age_g2 is age grouping: age<=24 is 1, 24<age<=49 is 2, 50<age<=65 is 3, age>65 is 4
# age_gtext is age grouping: age<=27 is 1, 27<age<=55 is 2, age>55 is 3
# age_g3 is age grouping: age<=55 is 1, age>55 is 2
#
# note: amt_g1 is amount grouping: amt<=500 is 5, 500<amt<=1000 is 10, 1000<amt<=2000 is 20, amt>2000 is 30
# amt_g2 is amount grouping: amt<=50 is 0.5, 50<amt<=250 is 2.5, 250<amt<=500 is 5, 500<amt<=1000 is 10,
# 1000<amt<=1500 is 15, 1500<amt<=2000 is 20, amt>2000 is 30
# amt_gtext is amount grouping: amt<=500 is 5, 500<amt<=1000 is 10, amt>1000 is 30
# Check for missing values, column-wise (True if the column contains any NaN).
# `display` is the Jupyter/IPython rich-output helper; this file is a notebook dump.
display(df.isna().any())
# Drop any missing values (left disabled; the check above apparently found none — re-enable for new data).
# df = df.dropna()
# Checking for duplicate rows. Remove them before applying Machine learning algorithm
df = df.drop_duplicates()
df
# use describe() method to create a statistical summary to help describe the dataset
# we want to see all the data falls within 3 standard deviations from the mean, checking for outliers
df.describe()
# Keep only the original (ungrouped) columns for exploration; the grouped columns are used later for modeling.
original_columns = ['instore','age','items','amt','region']
df_reduced = df[original_columns]
df_reduced
# Build a Scatter Plot X & y data
#X = df['amt']
#y = df['age']
%matplotlib inline
# create the scatter plot
sns.pairplot(df_reduced, hue='region', size=1.5);
#make sure it's formatted
#plt.title('amt vs age')
#plt.xlablel("Amount")
#plt.ylabel("Age")
#plt.legel()
#plt.show
From the plots, we can see a bigger age span online than in-store; both channels sold no more than 8 items, and online sales span a greater amount range than in-store. Region 2 has older buyers (both height and width of its cluster), while region 4 has the high spenders; the older folks in region 2 spend less than 1000, and region-2 buyers older than 75 years spent less than 500. Most people buy 2 to 7 items, with a smaller proportion buying just 1 or 8 items. In the last plot, region 2 is highly concentrated from about 0 to 500. The other regions show 2 levels: 1000 is a breaking point for R1 and R3, and 2000 is a breaking point for R4.
# Boxplot of age by shopping channel (instore > 0.5 is treated as in-store in the splits below).
sns.set_style("whitegrid")
sns.boxplot(y = 'age', x = 'instore', data = df)
Looking at the boxplots, the online channel has a bigger age span, and its median is higher than the in-store median. With such a big overlap, we cannot conclude that customers who shop in-store are older than online shoppers.
# Split the reduced data by shopping channel (instore is 0/1, so >0.5 selects in-store, <0.5 selects online).
df_instore = df_reduced[df.instore > 0.5]
df_online = df_reduced[df.instore < 0.5]
# Summary statistics for in-store transactions.
df_instore.describe()
# Summary statistics for online transactions.
df_online.describe()
# Only consider regions 3 & 4, since they have both online and in-store transactions.
df_3_4 = df_reduced[df.region > 2]
sns.set_style("whitegrid")
sns.boxplot(y = 'age', x = 'instore', data = df_3_4)
Focusing on regions 3 and 4, the online channel has no buyers over 63 years old. However, because the IQRs are so close, we cannot say that there is any significant age difference between the two groups.
# split amount data by instore and region 3 or 4
# NOTE(review): indexing df_3_4 with a mask built on the full df makes pandas
# reindex the boolean key (it emits a UserWarning); df_3_4.instore would be cleaner.
df_instore_3_4 = df_3_4[df.instore > 0.5]
df_online_3_4 = df_3_4[df.instore < 0.5]
# Summary statistics for in-store transactions (regions 3 & 4).
df_instore_3_4.describe()
# Summary statistics for online transactions (regions 3 & 4).
df_online_3_4.describe()
# Strength of correlations (rule-of-thumb bands used in the narration below):
# Very strong relationship (|r| >= 0.8)
# Strong relationship (|r| >= 0.6)
# Moderate relationship (|r| >= 0.4)
# Weak relationship (|r| >= 0.2)
# Very weak relationship (0< |r|< 0.2)
# Create Pearson correlation matrix (corr_mat) to identify which features are correlated
# and which will have more impact on the target column than some others
corr_mat = df_reduced.corr()
print(corr_mat)
age and amount, and age and region, have a weak negative correlation; age and instore have a very weak correlation; age and items are not correlated
# Create Pearson correlation matrix visualization, and limit to 2 decimal places:
# NOTE(review): Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0;
# on current pandas use .style.background_gradient(cmap='coolwarm').format(precision=2).
corr_mat.style.background_gradient(cmap='coolwarm').set_precision(2)
# See the number of transactions by the type of shopping channel.
df["instore"].value_counts()
# Split the data by region (region takes integer values 1-4, so range comparisons pick out each value).
df1 = df[df.region < 2] # region 1
df2 = df[(df.region > 1) & (df.region < 3)] # region 2
df3 = df[(df.region > 2) & (df.region < 4) ] # region 3
df4 = df[df.region > 3]# region 4
df3_0 = df[(df.region > 2) & (df.region < 4) & (df.instore < 1)] # region 3 online
df3_1 = df[(df.region > 2) & (df.region < 4) & (df.instore > 0)] # region 3 in-store
df4_0 = df[(df.region > 3) & (df.instore < 1)] # region 4 online
df4_1 = df[(df.region > 3) & (df.instore > 0)] # region 4 in-store
# Pearson Correlation matrix for region 1 (instore is constant within a region-1/2 subset only if that region is single-channel)
new_columns = ['instore','age','items','amt']
df1_reduced = df1[new_columns]
corr_mat1 = df1_reduced.corr()
corr_mat1.style.background_gradient(cmap='coolwarm').set_precision(2)
age and amount have weak negative correlation, age and items are not correlated
# Pearson Correlation matrix for region 2
df2_reduced = df2[new_columns]
corr_mat2 = df2_reduced.corr()
corr_mat2.style.background_gradient(cmap='coolwarm').set_precision(2)
age and items, age and amount are not correlated
# Pearson Correlation matrix for region 3
df3_reduced = df3[new_columns]
corr_mat3 = df3_reduced.corr()
corr_mat3.style.background_gradient(cmap='coolwarm').set_precision(2)
amount is (moderate) negatively correlated with instore
age is weakly correlated with instore, and amount. no correlation with items
# Pearson Correlation matrix for region 3 online
# instore is constant within this subset, so it is dropped from the column list.
newer_columns = ['age','items','amt']
df3_0_reduced = df3_0[newer_columns]
corr_mat3_0 = df3_0_reduced.corr()
corr_mat3_0.style.background_gradient(cmap='coolwarm').set_precision(2)
no correlation
# Pearson Correlation matrix for region 3 in-store
df3_1_reduced = df3_1[newer_columns]
corr_mat3_1 = df3_1_reduced.corr()
corr_mat3_1.style.background_gradient(cmap='coolwarm').set_precision(2)
no linear correlation between items and age. Very weak negative correlation between amt and age.
# Pearson Correlation matrix for region 4
df4_reduced = df4[new_columns]
corr_mat4 = df4_reduced.corr()
corr_mat4.style.background_gradient(cmap='coolwarm').set_precision(2)
instore and amount have a weak negative correlation, age and amt have a very weak positive correlation, and age and instore have a very weak correlation. items is not correlated with instore, age, or amount
# Pearson Correlation matrix for region 4 online
df4_0_reduced = df4_0[newer_columns]
corr_mat4_0 = df4_0_reduced.corr()
corr_mat4_0.style.background_gradient(cmap='coolwarm').set_precision(2)
no correlation
# Pearson Correlation matrix for region 4 in-store
# (comment corrected: this cell covers region 4, not region 3)
df4_1_reduced = df4_1[newer_columns]
corr_mat4_1 = df4_1_reduced.corr()
corr_mat4_1.style.background_gradient(cmap='coolwarm').set_precision(2)
no linear correlation
# Import the linear regression class:
from sklearn.linear_model import LinearRegression
# Instantiate the LinearRegression class; fit_intercept=True asks the model to learn a y-intercept.
model = LinearRegression(fit_intercept=True)
model
# Arrange data into a features matrix and target.
# We look at region 1 only, because its correlation matrix shows a weak correlation between age and amt.
# BUG FIX: the original `df[['age'],['instore']]` is invalid pandas indexing (raises an error).
# The target y below comes from df1 (region 1) and the fitted model reported later has a single
# coefficient, so the feature matrix must be the region-1 age column.
X_regr = df1[['age']]
X_regr.shape
# Identify the target array, y (region-1 purchase amounts).
y = df1[['amt']]
y.shape
# Use fit() method to fit the model
# This fit() command causes a number of model-dependent internal computations to take place, and the results of these
# computations are stored in model-specific attributes that the user can explore.
model.fit(X_regr, y)
# In Scikit-Learn, by convention all model parameters that were learned during the fit() process have trailing underscores
# The coefficients
model.coef_
# y-intercept
model.intercept_
For Region 1, the fitted model is amt = 1080.24 − 7.67·age
# Once the model is trained, the main task of supervised machine learning is to evaluate it based on what it says about new
# data that was not part of the training set. In Scikit-Learn, this can be done using the predict() method.
# For the sake of this example, our "new data" will be a grid of x values, and we will ask what y values the model predicts:
# coerce these x values into a [n_samples, n_features] matrix (np.newaxis adds the feature axis).
xfit = np.linspace(18, 64)
Xfit = xfit[:, np.newaxis]
yfit = model.predict(Xfit)
# Visualize the results by plotting first the raw data, and then this model fit:
# NOTE(review): lmplot takes x/y as *column names* of `data`, so the commented call
# below would not run as written; plt.scatter + plt.plot(xfit, yfit) is the usual pattern.
# sns.set_style('whitegrid')
# sns.lmplot(xfit, yfit, data = df1 )
# Leverage the built in machine learning Models in the libraries
from sklearn.tree import DecisionTreeClassifier # A Decision Tree Classifier
from sklearn.model_selection import train_test_split # Split arrays/matrices into random train and test subsets
from sklearn import linear_model # Import linear
#from sklearn.linear_model import LinearRegression # Import linear model
from sklearn.model_selection import cross_val_score # Import cross_val_score function
from sklearn.metrics import accuracy_score #Accuracy classification score.(best performance =1)
from sklearn.metrics import confusion_matrix # describe performance of a classification model, visualization algorithm performance
from sklearn.metrics import classification_report #A text report shows main classification metrics
from sklearn.ensemble import RandomForestClassifier # A random forest is a meta estimator that fits a number of
# decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive
# accuracy and control over-fitting. The sub-sample size is always the same as the original input sample size
# but the samples are drawn with replacement if bootstrap=True (default).
from sklearn.ensemble import GradientBoostingClassifier #GB builds an additive model in a forward stage-wise
# fashion; allows for the optimization of arbitrary differentiable loss functions. In each stage
# n_classes_ regression trees are fit on the negative gradient of the binomial or multinomial deviance
# loss function. Binary classification is a special case where only a single regression tree is induced.
from sklearn.tree import export_graphviz # Export decision tree in DOT format
# Slicing using the [ ] operator selects a set of rows and/or columns from a DataFrame
# To slice out a set of rows, use the following syntax: data[start:stop].
# To select any given column, we can use Pandas to select a column by its name
# variable_name = dataframe['columnName']
# split dataset in Features and Target variable
# select features to create feature matrix {n samples, m columns}
# The features (columns) always refer to the distinct observations that describe each sample in a quantitative manner.
# Features are generally real-valued, but may be Boolean or discrete-valued in some cases.
# X = df.iloc[:,0:4]
feature_cols = ['instore','amt','region','items']
X = df[feature_cols] #Features
print('Summary of feature sample')
X.head()
# Select the Dependent Variable or Target Array
# Target array by convention we will usually call y. The target array is usually one dimensional, with length n_samples,
# and is generally contained in a NumPy array or Pandas Series. The target array may have continuous numerical values, or
# discrete classes/labels. While some Scikit-Learn estimators do handle multiple target values in the form of a 2-dimensional,
# [n_samples, n_targets] target array, we will primarily be working with the common case of a one-dimensional target array.
# (Hint: Investigate the Relationship Between the Region of Purchase and a Customer's Age):
y = df.age
In order to do this efficiently, create a copy and store each instance we've already imported in a list. Create an empty list and append it with both algorithms as follows:
# Register each candidate classifier with a printable label so both
# algorithms can be cross-validated and reported in one pass.
algos_Class = []
for label, estimator in (
        ('Random Forest Classifier = ', RandomForestClassifier()),
        ('Decision Tree Classifier = ', DecisionTreeClassifier())):
    algos_Class.append((label, estimator))
To build and assess both models, we create an empty list to store the results and another to hold the name of each algorithm so we can easily print out the results and keep them separated as follows:
# classification: 3-fold cross-validated accuracy for each registered algorithm,
# predicting raw age from the transaction features.
results = []
names = []
for label, clf in algos_Class:
    fold_scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    names.append(label)
    results.append(fold_scores)
# Report the mean accuracy per algorithm.
for idx in range(len(names)):
    print(names[idx], results[idx].mean())
only 2% accuracy. Very bad
# Experiments 3-9: vary the target binning (age_g1/g2/gtext/g3) and the amount
# feature (amt/amt_g1/amt_g2) to see how binning affects cross-validated accuracy.
# use age_g1 for target
y1 = df.age_g1
# classification 3
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X, y1, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
slight improvement to the accuracy scores to 21%
# replace amt by amt_g1 from features
feature_cols = ['instore','region','amt_g1']
X1 = df[feature_cols] #Features
# classification 4
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X1, y1, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
slight improvement to the accuracy scores to 24%
# replace amt_g1 by amt_g2 from features
feature_cols = ['instore','region','amt_g2']
X2 = df[feature_cols] #Feature
# classification 5
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X2, y1, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
insignificant improvement
# use age_g2
y2 = df.age_g2
# classification 6
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X2, y2, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
significant improvement (less target bins seems to improve predictions) to 53%
# use age_gtext
y3 = df.age_gtext
# classification 7 (note cv switches from 3 to 5 folds here)
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X2, y3, cv = 5, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
reduce the target to 3 bins, improve the predictions to 62%
# classification 8
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X1, y3, cv = 5, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
Fixed y3, changed x with amt_g1, amt_g2, amt_gtext, same results. That is to improve the prediction, we will have to reduce target bins to 2.
# use age_g3
# classification 9 (label corrected: the original comment repeated "classification 8")
y4 = df.age_g3
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X2, y4, cv = 5, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
accuracy improved to 75%
Now that cross validation has been used for choosing a model, it is time to proceed with training the model and assessing its performance. Since this is a classification problem we will specify three different performance metrics to assess the performance of the model and the accuracy of its predictions. The three metrics are Accuracy, Weighted Mean Recall, Weighted Mean Precision
# Step 1:
# Train/Test Split: splitting data into 70% for training and 30% for testing.
# NOTE(review): the comment originally claimed the split is stratified, but no
# stratify= argument is passed — confirm whether stratify=y4 was intended.
X_train, X_test, y_train, y_test = train_test_split(X2, y4, test_size = .30, random_state = 123)
# Step 2:
# instantiate the algorithm just as we did previously, and fit or apply the algorithm
# to our training sets to build the model as follows:
# Modeling (Classification)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
# Step 3:
# Use the fully trained model on the testing set to make predictions before it can be assessed:
# Predictions
preds = model.predict(X_test)
# Step 4:
# Print out the classification report using the predictions just made and the ground truth or
# the actual values from the testing set to assess the model with a given metric as follows
#
# Precision is the ability of a classifier not to label an instance positive that is actually negative.
# For each class it is defined as the ratio of true positives to the sum of true and false positives.
# Said another way, "for all instances classified positive, what percent was correct?"
#
# Recall is the ability of a classifier to find all positive instances.
# For each class it is defined as the ratio of true positives to the sum of true positives and false negatives.
# Said another way, "for all instances that were actually positive, what percent was classified correctly?"
#
# The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0.
# Generally speaking, F1 scores are lower than accuracy measures as they embed precision and recall into their computation.
# As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.
#
# Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data
# may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified
# sampling or rebalancing. Support doesn't change between models but instead diagnoses the evaluation process.
#
# High recall, low precision: This means that most of the positive examples are correctly recognized (low FN) but there
# are a lot of false positives.
#
# Low recall, high precision: This shows that we miss a lot of positive examples (high FN) but those we predict as
# positive are indeed positive (low FP)
#
print(classification_report(y_test, preds))
In the classification report, of all instances identified as age <= 55, 82% were correctly identified (precision). Of all instances that were actually age <= 55, 84% were classified correctly (recall). For the age > 55 group, both precision and recall are close to 50%, which is not very good. Since there are more counts in the age <= 55 group, the weighted mean recall and weighted mean precision are close to 75%.
# Print the overall accuracy score on the held-out test set.
print('Accuracy Score: ',accuracy_score(y_test, preds))
The accuracy_score function is also incorporated in the classification report.
# Output the confusion matrix (rows = true classes, columns = predicted classes).
confusion_matrix(y_test, preds)
# Calculate Classification Rate/Accuracy:
# Classification Rate or Accuracy is given by the relation:
# accuracy = (TP + TN) /(TP + TN + FP + FN)
# However, there are problems with accuracy. It assumes equal costs for both kinds of errors.
# A 99% accuracy can be excellent, good, mediocre, poor or terrible depending upon the problem.
# Counts below are read from the confusion matrix above; for the "<=55" class this
# gives TP=14789, FN=2867, FP=3112, TN=3226 (rows are true classes — TODO confirm).
Acc= (14789+3226)/(14789+2867+3226+3112)
Acc
# Calculate Recall for the "<=55" class.
# BUG FIX: recall is TP/(TP+FN), not TP/(TP+TN) as originally written (and computed).
Rec = 14789/(14789+2867)
Rec
# Step 5:
# Visualize the trained decision tree via graphviz DOT output.
# FIX: sklearn.externals.six was removed from scikit-learn (0.23+); StringIO lives in io.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
age_values = ['<=55','>55'] # class labels for the binary age_g3 target
dot_data = StringIO()
# FIX: class_names must be the age_values list defined above; the original passed
# `region_values`, which was never defined and raised a NameError.
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X2.columns, class_names = age_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#graph.write_png('tree.png')
Image(graph.create_png())
# Repeat steps 1-4: splitting data into 75% for training and 25% for testing,
# this time with the amt_g1 feature set (X1), then visualize the fitted tree.
X_train, X_test, y_train, y_test = train_test_split(X1, y4, test_size = .25, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
dot_data = StringIO()
# FIX: class_names corrected to age_values (labels for the binary age_g3 target);
# `region_values` was never defined and raised a NameError.
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X1.columns, class_names = age_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Steps 1-4 again: splitting data into 80% for training and 20% for testing
# (different random_state), then visualize the fitted tree.
X_train, X_test, y_train, y_test = train_test_split(X2, y4, test_size = .20, random_state = 35)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
dot_data = StringIO()
# FIX: class_names corrected to age_values; `region_values` was never defined (NameError).
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X2.columns, class_names = age_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Increase max_depth and refit on the full data.
# NOTE: model.score(X2, y4) here is training accuracy, so it is optimistic.
model = DecisionTreeClassifier(max_depth=10)
model.fit(X2,y4)
model.score(X2,y4)
model.predict([[0,3,10]]) # prediction: online, region 3, amt_g2 bin 10
model.predict([[1,3,30]]) # prediction: in-store, region 3, amt_g2 bin 30
model.predict([[0,1,5]]) # prediction: online, region 1, amt_g2 bin 5
# Switch the split criterion from "gini" to "entropy".
# FIXES: (1) the original spelled the criterion 'enthropy', which raises an error;
# (2) the new estimator was never assigned or fit, so the score below re-scored the
# old max_depth=10 model; (3) min_impurity_split and presort were removed from
# scikit-learn 1.0+, so those arguments are dropped.
model = DecisionTreeClassifier(criterion='entropy', max_depth=2,
random_state=None, splitter='random')
model.fit(X2,y4)
model.score(X2,y4)
# Define features and target: now predict the shopping channel (instore)
# from grouped age, grouped amount, and region.
feature_cols = ['age_g1','amt_g2','region']
X_instore1 = df[feature_cols] #Features
y_instore = df.instore
# classification: 3-fold cross-validated accuracy per algorithm
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_instore1, y_instore, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
89% accuracy
# Repeat steps 1-4: splitting data into 75% for training and 25% for testing and ensuring the data
X_train, X_test, y_train, y_test = train_test_split(X_instore1, y_instore, test_size = .25, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
# Class labels in sorted class order: instore==0 -> 'online', instore==1 -> 'instore'.
instore_values = ['online','instore']
dot_data = StringIO()
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X_instore1.columns, class_names = instore_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#graph.write_png('tree.png')
Image(graph.create_png())
# Persist the rendered tree to disk as well as displaying it inline.
graph.write_png('tree_instore.png')
# Feature experiments for the instore target: swap the age/amount representations
# (grouped vs raw) and compare cross-validated accuracy.
# change to age_g3
feature_cols = ['age_g3','amt_g2','region']
X_instore2 = df[feature_cols] #Features
# classification
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_instore2, y_instore, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
# change to age
feature_cols = ['age','amt_g2','region']
X_instore3 = df[feature_cols] #Features
# classification
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_instore3, y_instore, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
# change to age and amt
feature_cols = ['age','amt','region']
X_instore4 = df[feature_cols] #Features
# classification
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_instore4, y_instore, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
age is a better predictor compare to age grouping, but not so for amount grouping
# change to age and amt_g1 (comment corrected: this cell uses amt_g1, not raw amt)
feature_cols = ['age','amt_g1','region']
X_instore5 = df[feature_cols] #Features
# classification
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_instore5, y_instore, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
accuracy improves slightly to 89%
# Repeat steps 1-4: splitting data into 75% for training and 25% for testing and ensuring the data
X_train, X_test, y_train, y_test = train_test_split(X_instore5, y_instore, test_size = .25, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))
dot_data = StringIO()
export_graphviz(model, out_file = dot_data, filled = True, rounded = True,
feature_names=X_instore5.columns, class_names = instore_values, label='all', precision = 1, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#graph.write_png('tree.png')
Image(graph.create_png())
# split dataset in Features and Target variable
# Final target: predict the region from channel, age, items, and grouped amount.
# X = df.iloc[:,0:4]
feature_columns = ['instore','age','items','amt_g2']
X_region = df[feature_columns] #Features
y_region = df.region # Target variable
# classification: 3-fold cross-validated accuracy per algorithm
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_region, y_region, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
# classification 2: try cv=5 for comparison
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_region, y_region, cv = 5, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
compare cv=3 and cv=5, only tiny improvement to the accuracy of both random forest and decision tree classifiers
# drop items from features, since "items" is uncorrelated to other variables
feature_columns = ['instore','age','amt_g2']
X_region1 = df[feature_columns] #Features
y_region = df.region # Target variable
# classification 3
results = []
names = []
for name, model in algos_Class:
accuracy1 = cross_val_score(model, X_region1, y_region, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
# replace age by age_g1, and amt_g2 by amt_g1, in the features
# (comment corrected: the code uses age_g1, not age_g2)
feature_cols = ['instore','age_g1','amt_g1']
X_region2 = df[feature_cols] #Features
# classification 4
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_region2, y_region, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
# replace amt_g1 by amt_g2 in the features (comment corrected: age_g1 is kept here)
feature_cols = ['instore','age_g1','amt_g2']
X_region3 = df[feature_cols] #Feature
# classification 5
results = []
names = []
for name, model in algos_Class:
accuracy = cross_val_score(model, X_region3, y_region, cv = 3, scoring ='accuracy')
names.append(name)
results.append(accuracy)
for i in range(len(names)):
print(names[i], results[i].mean())
# Train the region model on a 70/30 split using the fully grouped feature set (X_region2).
X_train, X_test, y_train, y_test = train_test_split(X_region2, y_region, test_size = .30, random_state = 123)
algo = DecisionTreeClassifier()
model = algo.fit(X_train,y_train)
# Predictions on the held-out test set.
preds = model.predict(X_test)
print(classification_report(y_test, preds))
Region 2 has high precision and high recall. Region 4 has a recall rate of 72% but 65% precision. Region 3 has a low recall rate. The overall accuracy, weighted mean precision, and weighted mean recall are in the 64% area.
# Per-fold cross-validation scores (default scorer and fold count) for the model on X_region3.
print(cross_val_score(model, X_region3, y_region))